Within this data lie important patterns and variables. Find some interesting insights from the data provided.
import pandas as pd # for reading the file
import numpy as np # for numerical & mathematical expressions
import seaborn as sns # for data visualization
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline
import time
import warnings
warnings.filterwarnings('ignore')
pip install pandas-profiling
Requirement already satisfied: pandas-profiling in c:\users\dell\anaconda3\lib\site-packages (3.1.0) Requirement already satisfied: htmlmin>=0.1.12 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (0.1.12) Requirement already satisfied: seaborn>=0.10.1 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (0.11.2) Requirement already satisfied: pydantic>=1.8.1 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (1.8.2) Requirement already satisfied: scipy>=1.4.1 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (1.7.1) Requirement already satisfied: joblib~=1.0.1 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (1.0.1) Requirement already satisfied: matplotlib>=3.2.0 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (3.4.3) Requirement already satisfied: tangled-up-in-unicode==0.1.0 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (0.1.0) Requirement already satisfied: requests>=2.24.0 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (2.26.0) Requirement already satisfied: phik>=0.11.1 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (0.12.0) Requirement already satisfied: markupsafe~=2.0.1 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (2.0.1) Requirement already satisfied: pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (1.3.4) Requirement already satisfied: numpy>=1.16.0 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (1.20.3) Requirement already satisfied: missingno>=0.4.2 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (0.5.0) Requirement already satisfied: jinja2>=2.11.1 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (2.11.3) Requirement already satisfied: tqdm>=4.48.2 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) 
(4.62.3) Requirement already satisfied: multimethod>=1.4 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (1.6) Requirement already satisfied: visions[type_image_path]==0.7.4 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (0.7.4) Requirement already satisfied: PyYAML>=5.0.0 in c:\users\dell\anaconda3\lib\site-packages (from pandas-profiling) (6.0) Requirement already satisfied: attrs>=19.3.0 in c:\users\dell\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.4->pandas-profiling) (21.2.0) Requirement already satisfied: networkx>=2.4 in c:\users\dell\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.4->pandas-profiling) (2.6.3) Requirement already satisfied: imagehash in c:\users\dell\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.4->pandas-profiling) (4.2.1) Requirement already satisfied: Pillow in c:\users\dell\anaconda3\lib\site-packages (from visions[type_image_path]==0.7.4->pandas-profiling) (8.4.0) Requirement already satisfied: pyparsing>=2.2.1 in c:\users\dell\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling) (3.0.4) Requirement already satisfied: python-dateutil>=2.7 in c:\users\dell\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling) (2.8.2) Requirement already satisfied: cycler>=0.10 in c:\users\dell\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling) (0.10.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\dell\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling) (1.3.1) Requirement already satisfied: six in c:\users\dell\anaconda3\lib\site-packages (from cycler>=0.10->matplotlib>=3.2.0->pandas-profiling) (1.16.0) Requirement already satisfied: pytz>=2017.3 in c:\users\dell\anaconda3\lib\site-packages (from pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3->pandas-profiling) (2021.3) Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\users\dell\anaconda3\lib\site-packages 
(from pydantic>=1.8.1->pandas-profiling) (3.10.0.2) Requirement already satisfied: certifi>=2017.4.17 in c:\users\dell\anaconda3\lib\site-packages (from requests>=2.24.0->pandas-profiling) (2021.10.8) Requirement already satisfied: charset-normalizer~=2.0.0 in c:\users\dell\anaconda3\lib\site-packages (from requests>=2.24.0->pandas-profiling) (2.0.4) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\dell\anaconda3\lib\site-packages (from requests>=2.24.0->pandas-profiling) (1.26.7) Requirement already satisfied: idna<4,>=2.5 in c:\users\dell\anaconda3\lib\site-packages (from requests>=2.24.0->pandas-profiling) (3.2) Requirement already satisfied: colorama in c:\users\dell\anaconda3\lib\site-packages (from tqdm>=4.48.2->pandas-profiling) (0.4.4) Requirement already satisfied: PyWavelets in c:\users\dell\anaconda3\lib\site-packages (from imagehash->visions[type_image_path]==0.7.4->pandas-profiling) (1.1.1) Note: you may need to restart the kernel to use updated packages.
# Load the heart-attack dataset from the CSV file into a DataFrame.
data = pd.read_csv("Heart_Attack_Analysis_task4.csv")
from pandas_profiling import ProfileReport
# Build a profiling report for the loaded frame and display it.
profile = ProfileReport(data, title="Pandas Profiling Report")
profile
# Summarise the per-column null mask (count / unique / top / freq of the
# boolean flags) to check whether any column contains missing values.
data.isnull().describe()
| age | sex | cp | trtbps | chol | fbs | restecg | thalachh | exng | oldpeak | slp | caa | thall | output | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 |
| unique | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| top | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| freq | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 | 303 |
# Remove exact duplicate rows in place and renumber the index 0..n-1 in the
# same call, then display the resulting (rows, columns) shape.
data.drop_duplicates(inplace=True, ignore_index=True)
data.shape
(302, 14)
# Draw one boxplot per column so outliers can be spotted visually.
# The number of grid rows is derived from the column count (two plots per
# row) so every column gets its own axis.
n_rows = (len(data.columns) + 1) // 2
fig, ax = plt.subplots(ncols=2, nrows=n_rows, figsize=(27, 60))
ax = ax.flatten()
# Only the column names are needed, so pair each name with its axis directly
# instead of iterating (name, values) pairs and tracking a manual counter.
for axis, column in zip(ax, data.columns):
    sns.boxplot(y=column, data=data, ax=axis)
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)
plt.show()
# Inspect rows with very high cholesterol (chol > 390); the row at index 85
# is the one treated as the chol outlier.
data.loc[data['chol'] > 390]
| age | sex | cp | trtbps | chol | fbs | restecg | thalachh | exng | oldpeak | slp | caa | thall | output | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 28 | 65 | 0 | 2 | 140 | 417 | 1 | 0 | 157 | 0 | 0.8 | 2 | 1 | 2 | 1 |
| 85 | 67 | 0 | 2 | 115 | 564 | 0 | 0 | 160 | 0 | 1.6 | 1 | 0 | 3 | 1 |
| 96 | 62 | 0 | 0 | 140 | 394 | 0 | 0 | 157 | 0 | 1.2 | 1 | 0 | 2 | 1 |
| 219 | 63 | 0 | 0 | 150 | 407 | 0 | 0 | 154 | 0 | 4.0 | 1 | 3 | 3 | 0 |
| 245 | 56 | 0 | 0 | 134 | 409 | 0 | 0 | 150 | 1 | 1.9 | 1 | 2 | 3 | 0 |
# Inspect rows with a very low maximum heart rate (thalachh < 80); the row at
# index 271 is the thalachh outlier.
data.loc[data["thalachh"] < 80]
| age | sex | cp | trtbps | chol | fbs | restecg | thalachh | exng | oldpeak | slp | caa | thall | output | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 271 | 67 | 1 | 0 | 120 | 237 | 0 | 1 | 71 | 0 | 1.0 | 1 | 0 | 2 | 0 |
# Inspect rows with a very high oldpeak value (oldpeak > 5); the rows at
# indices 203 and 220 are the oldpeak outliers.
data.loc[data["oldpeak"] > 5]
| age | sex | cp | trtbps | chol | fbs | restecg | thalachh | exng | oldpeak | slp | caa | thall | output | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 203 | 62 | 0 | 0 | 160 | 164 | 0 | 0 | 145 | 0 | 6.2 | 0 | 3 | 3 | 0 |
| 220 | 55 | 1 | 0 | 140 | 217 | 0 | 1 | 111 | 1 | 5.6 | 0 | 0 | 3 | 0 |
# Drop the outlier rows identified by the chol / thalachh / oldpeak checks
# (positions 85, 203, 220 and 271), in place, then display the new shape.
# NOTE(review): four rows are removed here, so 302 rows should become 298;
# the recorded output shows 299 — confirm against a fresh run.
data.drop(index=data.index[[85, 203, 220, 271]], inplace=True)
data.shape
(299, 14)
# Plot the distribution of every column (histogram + KDE), one axis each.
# The number of grid rows is derived from the column count (two plots per
# row) so every column gets its own axis.
n_rows = (len(data.columns) + 1) // 2
fig, ax = plt.subplots(ncols=2, nrows=n_rows, figsize=(27, 60))
ax = ax.flatten()
for axis, (col, value) in zip(ax, data.items()):
    # sns.distplot is deprecated; histplot with kde=True and a density-scaled
    # histogram is its documented replacement.
    sns.histplot(x=value, kde=True, stat="density", ax=axis)
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)
# Age distribution of the patients: 30-bin density histogram with a KDE curve.
sns.set_style('darkgrid')
# sns.distplot is deprecated; histplot(kde=True, stat="density") is the
# documented replacement and is already used elsewhere in this notebook.
sns.histplot(x=data['age'], bins=30, kde=True, stat="density")
plt.title('Age Distribution of Patients',size=15)
plt.xlabel('Age',size=12);
# Column groups used for the analysis. Names must match the DataFrame columns
# exactly: the maximum-heart-rate column is 'thalachh' (double 'h').
numerical_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
categorical_cols = ['sex', 'cp', 'caa', 'fbs', 'restecg', 'exng', 'slp', 'thall']
target = ['output']
# Bar chart of how many patients fall into each target class ('output').
plt.figure(figsize=(12, 6))
sns.set(font_scale=1.4)
sns.set_style('whitegrid')
sns.countplot(data=data, x='output', palette='magma')
# Label the x axis of the current (countplot) axes.
plt.xlabel("Output")
plt.title("Countplot of People with less or more chance of heart attack")
Text(0.5, 1.0, 'Countplot of People with less or more chance of heart attack')
# Count the rows in each target class by summing a boolean mask.
# This counts every matching row, whereas DataFrame.value_counts() silently
# skips rows that contain a missing value in any column.
print("Number of people with more chance of heart attack:", (data["output"] == 1).sum())
print("Number of people with less chance of heart attack:", (data["output"] == 0).sum())
Number of people with more chance of heart attack: 164 Number of people with less chance of heart attack: 135
# Bar chart of the number of patients per value of the 'sex' column.
sns.countplot(data=data, x='sex', palette='magma')
plt.title("Countplot of Sex")
Text(0.5, 1.0, 'Countplot of Sex')
# Count the rows for each 'sex' label by summing a boolean mask.
# This counts every matching row, whereas DataFrame.value_counts() silently
# skips rows that contain a missing value in any column.
print("Number of people with sex labelled as 1:", (data["sex"] == 1).sum())
print("Number of people with sex labelled as 0:", (data["sex"] == 0).sum())
Number of people with sex labelled as 1: 204 Number of people with sex labelled as 0: 94
# 2x3 grid of count plots for the categorical features, laid out row by row.
fig, ax = plt.subplots(2, 3, figsize=(20, 18))
# Top row.
sns.countplot(data=data, x='fbs', palette='magma', ax=ax[0, 0]).set(title='Fasting Blood Sugar')
sns.countplot(data=data, x='exng', palette='magma', ax=ax[0, 1]).set(title='Exercise Induced Angina')
sns.countplot(data=data, x='cp', palette='magma', ax=ax[0, 2]).set(title='Chest Pain Type')
# Bottom row.
sns.countplot(data=data, x='restecg', palette='magma', ax=ax[1, 0]).set(title='Rest ECG')
sns.countplot(data=data, x='caa', palette='magma', ax=ax[1, 1]).set(title='Number of major vessels')
sns.countplot(data=data, x='thall', palette='magma', ax=ax[1, 2]).set(title='Thallium Stress Test')
[Text(0.5, 1.0, 'Thallium Stress Test')]
# 2x2 grid of histograms (with KDE overlay) for four continuous features.
fig, ax = plt.subplots(2, 2, figsize=(20, 18))
sns.histplot(data=data, x="age", kde=True, color="blue", ax=ax[0, 0]).set(title='Age')
sns.histplot(data=data, x="trtbps", kde=True, color="blue", ax=ax[0, 1]).set(title='Resting Blood Pressure')
sns.histplot(data=data, x="chol", kde=True, color="green", ax=ax[1, 0]).set(title='Cholestrol Levels')
sns.histplot(data=data, x="thalachh", kde=True, color="green", ax=ax[1, 1]).set(title='Maximum Heart Rate Achieved')
[Text(0.5, 1.0, 'Maximum Heart Rate Achieved')]
# Patient counts per target class, split by chest-pain type ('cp').
sns.countplot(data=data, x='output', hue='cp', palette='muted')
plt.title('Number of No Heart Attacks vs. Heart Attacks by Chest Pain type', fontsize=13)
plt.xlabel('Patients', fontsize=12)
plt.ylabel('Count', fontsize=12);
# Display the pairwise correlation matrix of all columns (pandas default method).
data.corr()
| age | sex | cp | trtbps | chol | fbs | restecg | thalachh | exng | oldpeak | slp | caa | thall | output | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 1.000000 | -0.088817 | -0.061404 | 0.290154 | 0.199493 | 0.125787 | -0.110029 | -0.393208 | 0.102705 | 0.204387 | -0.155231 | 0.308477 | 0.059918 | -0.220631 |
| sex | -0.088817 | 1.000000 | -0.047904 | -0.054817 | -0.186256 | 0.044262 | -0.075261 | -0.034871 | 0.137000 | 0.122017 | -0.041790 | 0.126330 | 0.224448 | -0.284103 |
| cp | -0.061404 | -0.047904 | 1.000000 | 0.054810 | -0.108550 | 0.094171 | 0.047954 | 0.284079 | -0.395201 | -0.131025 | 0.105434 | -0.193687 | -0.160748 | 0.424523 |
| trtbps | 0.290154 | -0.054817 | 0.054810 | 1.000000 | 0.166258 | 0.180277 | -0.114064 | -0.052007 | 0.067141 | 0.177908 | -0.113409 | 0.085949 | 0.058252 | -0.140193 |
| chol | 0.199493 | -0.186256 | -0.108550 | 0.166258 | 1.000000 | 0.018035 | -0.140873 | -0.022853 | 0.083189 | 0.081133 | -0.003168 | 0.121369 | 0.088353 | -0.117504 |
| fbs | 0.125787 | 0.044262 | 0.094171 | 0.180277 | 0.018035 | 1.000000 | -0.083970 | -0.014381 | 0.023966 | 0.018019 | -0.068208 | 0.146917 | -0.028985 | -0.030389 |
| restecg | -0.110029 | -0.075261 | 0.047954 | -0.114064 | -0.140873 | -0.083970 | 1.000000 | 0.059180 | -0.076541 | -0.055173 | 0.091684 | -0.074963 | -0.004903 | 0.142710 |
| thalachh | -0.393208 | -0.034871 | 0.284079 | -0.052007 | -0.022853 | -0.014381 | 0.059180 | 1.000000 | -0.388779 | -0.347358 | 0.380415 | -0.246724 | -0.098417 | 0.412548 |
| exng | 0.102705 | 0.137000 | -0.395201 | 0.067141 | 0.083189 | 0.023966 | -0.076541 | -0.388779 | 1.000000 | 0.299410 | -0.260003 | 0.132958 | 0.207123 | -0.439143 |
| oldpeak | 0.204387 | 0.122017 | -0.131025 | 0.177908 | 0.081133 | 0.018019 | -0.055173 | -0.347358 | 0.299410 | 1.000000 | -0.555496 | 0.229898 | 0.188596 | -0.429426 |
| slp | -0.155231 | -0.041790 | 0.105434 | -0.113409 | -0.003168 | -0.068208 | 0.091684 | 0.380415 | -0.260003 | -0.555496 | 1.000000 | -0.086155 | -0.087206 | 0.335765 |
| caa | 0.308477 | 0.126330 | -0.193687 | 0.085949 | 0.121369 | 0.146917 | -0.074963 | -0.246724 | 0.132958 | 0.229898 | -0.086155 | 1.000000 | 0.158570 | -0.411355 |
| thall | 0.059918 | 0.224448 | -0.160748 | 0.058252 | 0.088353 | -0.028985 | -0.004903 | -0.098417 | 0.207123 | 0.188596 | -0.087206 | 0.158570 | 1.000000 | -0.345042 |
| output | -0.220631 | -0.284103 | 0.424523 | -0.140193 | -0.117504 | -0.030389 | 0.142710 | 0.412548 | -0.439143 | -0.429426 | 0.335765 | -0.411355 | -0.345042 | 1.000000 |
# Annotated heatmap of the column correlation matrix, with the colour scale
# fixed to [-1, 1] and centred on zero.
plt.figure(figsize=(35, 15))
plt.title('Correlation of all the Columns', fontsize=20)
sns.heatmap(
    data.corr(),
    annot=True,
    fmt='.1g',
    vmin=-1,
    vmax=1,
    center=0,
    cmap='RdBu',
    square=True,
    linewidths=1,
    linecolor='white',
)
<AxesSubplot:title={'center':'Correlation of all the Columns'}>
# 2x3 grid of filled KDE plots showing how each feature is distributed for
# the two target classes ('output').
# `fill` is a boolean flag: pass True rather than the string "True", which
# only worked because any non-empty string is truthy.
fig, ax = plt.subplots(2,3, figsize=(22,18))
sns.kdeplot(x="age", data=data, hue="output", ax=ax[0][0], fill=True, palette="magma").set(title="Heart Attack related to Age")
sns.kdeplot(x="cp", data=data, hue="output", ax=ax[0][1], fill=True, palette="viridis").set(title="Heart Attack related to Chest Pain Types")
sns.kdeplot(x="thalachh", data=data, hue="output", ax=ax[1][0], fill=True, palette="viridis").set(title="Heart Attack related to Heart Rate")
sns.kdeplot(x="chol", data=data, hue="output", ax=ax[1][1], fill=True, palette="magma").set(title="Heart Attack related to Cholestrol")
sns.kdeplot(x="thall", data=data, hue="output", ax=ax[0][2], fill=True, palette="magma").set(title="Heart Attack related to Thallium Stress Test")
sns.kdeplot(x="trtbps", data=data, hue="output", ax=ax[1][2], fill=True, palette="viridis").set(title="Heart Attack related to Blood Pressure")
[Text(0.5, 1.0, 'Heart Attack related to Blood Pressure')]
# 2x2 grid of boxplots comparing continuous features across the categories of
# 'thall' (first three plots) and 'cp' (last plot).
fig, ax = plt.subplots(2,2, figsize=(18, 15))
sns.boxplot(x="thall", y="thalachh", data=data, palette="magma", ax=ax[0][0]).set(title="Thallium Stress Test vs Max Heart Rate")
sns.boxplot(x="thall", y="trtbps", data=data, palette="viridis", ax=ax[0][1]).set(title="Thallium Stress Test vs Resting Blood Pressure")
# This panel plots thall against chol, so the title names those variables
# (it previously repeated the "Chest Pain Type vs Age" title of the next plot).
sns.boxplot(x="thall", y="chol", data=data, palette="viridis", ax=ax[1][0]).set(title="Thallium Stress Test vs Cholesterol")
sns.boxplot(x="cp", y="age", data=data, palette="magma", ax=ax[1][1]).set(title="Chest Pain Type vs Age")
[Text(0.5, 1.0, 'Chest Pain Type vs Age')]